# pip install plotly  -- shell/notebook command (run as `!pip install plotly` in Jupyter); invalid as plain Python
import numpy as np
import pandas as pd
import json
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import plotly.graph_objects as go
import datetime as dt
import plotly.express as px
import math
from PIL import Image
import glob
import seaborn as sns
import re
import os
import seaborn as sns
# %matplotlib inline  -- IPython magic; only valid inside a Jupyter notebook
# Load the per-country trending-video datasets and tag each row with its
# country code. A loop replaces the original five copy-pasted read/assign pairs.
_frames = {}
for _code in ('CA', 'US', 'FR', 'GB', 'IN'):
    _frame = pd.read_csv(f'{_code}videos.csv')
    _frame['country'] = _code
    _frames[_code] = _frame
# Keep the per-country names that the rest of the analysis relies on.
CA, US, FR, GB, IN = (_frames[c] for c in ('CA', 'US', 'FR', 'GB', 'IN'))
# Combined dataframe across all five countries.
df = pd.concat([CA, US, FR, GB, IN])
df.to_csv('ALL.csv')
df.head()
# Map the numeric category ids onto human-readable names taken from the
# US category JSON file.
df['category_id'] = df['category_id'].astype(str)
with open('US_category_id.json', 'r') as f:
    _items = json.load(f)['items']
category_id = {item['id']: item['snippet']['title'] for item in _items}
df.insert(4, 'category', df['category_id'].map(category_id))
category_list = df['category'].unique()
category_list
# Parse the date columns. trending_date is formatted as yy.dd.mm in these files.
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
# publish_time is ISO-8601; pandas infers it by default. The original passed
# infer_datetime_format=True, which is deprecated and a no-op since pandas 2.0.
df['publish_time'] = pd.to_datetime(df['publish_time'])
# Derive the date/weekday/hour parts BEFORE overwriting publish_time with its
# time-of-day component — the order of these statements matters.
df['publish_date'] = df['publish_time'].dt.date
df['publish_wd'] = df['publish_time'].dt.weekday
df['publish_hr'] = df['publish_time'].dt.hour
df['publish_time'] = df['publish_time'].dt.time
df.head()
# Drop columns that are not used in the analysis, then exact-duplicate rows.
df = df.drop(['tags', 'video_error_or_removed', 'description'], axis=1)
df = df.drop_duplicates(keep='first')
df.info()
pd.options.display.float_format = '{:.2f}'.format
df.describe()
# Create a category_name column matching category_id (ids are ints in US).
# NOTE: the original code assigned id 25 twice ('News and Politics' at first,
# then 'News & Politics'); the last assignment won, so only the latter is kept.
_US_CATEGORY_NAMES = {
    1: 'Film and Animation',
    2: 'Cars and Vehicles',
    10: 'Music',
    15: 'Pets and Animals',
    17: 'Sport',
    19: 'Travel and Events',
    20: 'Gaming',
    22: 'People and Blogs',
    23: 'Comedy',
    24: 'Entertainment',
    25: 'News & Politics',
    26: 'How to and Style',
    27: 'Education',
    28: 'Science and Technology',
    29: 'Non Profits and Activism',
}
# Series.map leaves unmapped ids as NaN, matching the np.nan initialisation
# in the original .loc chain.
US['category_name'] = US['category_id'].map(_US_CATEGORY_NAMES)
# log1p(x) == log(x + 1): compress the heavy-tailed count distributions.
US['likes_log'] = np.log1p(US['likes'])
US['views_log'] = np.log1p(US['views'])
US['dislikes_log'] = np.log1p(US['dislikes'])
US['comment_log'] = np.log1p(US['comment_count'])
# 2x2 grid of the log-scaled engagement distributions.
# sns.distplot was deprecated in seaborn 0.11 and later removed;
# histplot(..., kde=True, stat='density') is the supported equivalent.
plt.figure(figsize=(24, 12))
sns.set(font_scale=1)
_panels = [
    (221, 'views_log', None, "Views log distribution"),
    (224, 'likes_log', 'green', 'Likes log distribution'),
    (223, 'dislikes_log', 'red', "Dislikes log distribution"),
    (222, 'comment_log', 'orange', "Comments log distribution"),
]
for _pos, _col, _color, _title in _panels:
    plt.subplot(_pos)
    _ax = sns.histplot(US[_col], color=_color, kde=True, stat='density')
    _ax.set_title(_title, fontsize=16)
plt.subplots_adjust(wspace=0.2, hspace=0.4, top=0.9)
plt.show()
# Make a dataframe of category_name vs number of trending videos.
US_count = US.category_name.value_counts().rename_axis('category_name').reset_index(name='counts')
US_count
# Bar chart of counts per category. The font scale is set BEFORE plotting so it
# actually applies to this figure — the original set it afterwards, where it
# only affected subsequent plots.
sns.set(font_scale=5)
plt.subplots(figsize=(40, 20))
sns.barplot(x='counts', y='category_name', data=US_count).set(title='US trending video number by category')
# Box plot: views (log scale) broken down by category name.
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x='category_name', y='views_log', data=US)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, size=15)
sns.set(font_scale=1)
ax.set_title('Views Distribution by Category Names', fontsize=15)
ax.set_xlabel('', fontsize=15)
ax.set_ylabel('Views(log)', fontsize=15)
plt.show()
# Engagement rates expressed as a percentage of views.
US['like_rate'] = US['likes'] / US['views'] * 100
US['dislike_rate'] = US['dislikes'] / US['views'] * 100
US['comment_rate'] = US['comment_count'] / US['views'] * 100
# Box plot of the like rate per category.
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x='category_name', y='like_rate', data=US)
sns.set(font_scale=1)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, size=15)
ax.set_title('Like Rate Distribution', fontsize=15)
ax.set_xlabel("", fontsize=12)
ax.set_ylabel('Like rate', fontsize=12)
plt.show()
# Box plot of the dislike rate per category.
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x='category_name', y='dislike_rate', data=US)
sns.set(font_scale=1)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_title('Dislike Rate Distribution', fontsize=15)
ax.set_xlabel('', fontsize=12)
ax.set_ylabel('Dislike rate', fontsize=12)
plt.show()
# Box plot of the comment rate per category.
plt.figure(figsize=(20, 6))
ax = sns.boxplot(x='category_name', y='comment_rate', data=US)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
sns.set(font_scale=1)
ax.set_title('Comment Rate Distribution', fontsize=15)
ax.set_xlabel('', fontsize=12)
ax.set_ylabel('Comment rate', fontsize=12)
plt.show()
# Check which publish years appear in the dataset. publish_time in US is still
# the raw ISO string (df's transformations operated on a concat copy), so the
# first four characters give the year.
US['publish_year'] = US['publish_time'].astype(str).str[:4]
US.publish_year.value_counts()
US.head(2)
# Only 2017 and 2018 have substantial data, so focus the comparison on them.
def _year_category_counts(year):
    # Per-category video counts for one publish year, tagged with the year.
    subset = US[US['publish_year'] == year]['category_name'].reset_index()
    counts = subset.category_name.value_counts().rename_axis('category_name').reset_index(name='counts')
    counts['year'] = year
    return counts

US_2017 = _year_category_counts('2017')
US_2017.head()
US_2018 = _year_category_counts('2018')
US_2018.head()
US_year = pd.concat([US_2017, US_2018])
plt.subplots(figsize=(40, 20))
sns.barplot(x='counts', y='category_name', hue='year', data=US_year, palette="Blues").set(title='number of videos by category and years')
sns.set(font_scale=1)
# Year-over-year growth in trending-video counts per category (2017 -> 2018).
_counts_2017 = US_2017.groupby('category_name')['counts'].sum()
_counts_2018 = US_2018.groupby('category_name')['counts'].sum()
number_growth = (_counts_2018 / _counts_2017 - 1).sort_values(ascending=False).reset_index()
number_growth.columns = ['category_name', 'growth_rate']
number_growth
plt.figure(figsize=(20, 6))
ax = sns.barplot(x='category_name', y='growth_rate', data=number_growth)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
ax.set_title("Growth rate by video category ", fontsize=15)
ax.set_xlabel('category_name', fontsize=15)
ax.set_ylabel('growth_rate', fontsize=12)
plt.show()
# A. Share of trending videos contributed by each of the five countries.
_by_country = df.groupby(['country']).count()
labels = _by_country.index
sizes = _by_country['title']
explode = (0, 0, 0, 0.1, 0)  # pull out the 4th slice (countries sort as CA, FR, GB, IN, US)
cmap = plt.get_cmap('Spectral')
colors = [cmap(i) for i in np.linspace(0, 1, 8)]
plt.subplots(figsize=(10, 10))
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
        shadow=True, explode=explode, startangle=100, colors=colors)
plt.show()
# B. How many days does a video stay on the trending list in each country?
# Count appearances of each (video, country) pair, i.e. days on the list.
fre_df = pd.DataFrame(df.groupby(['video_id', 'country']).count()['title'].sort_values(ascending=False)).reset_index()
fre_df.head(), fre_df.tail()
video_list, max_list = list(), list()
country_list = df.groupby(['country']).count().index
for c in country_list:
    # Distribution of "number of appearances" for this country's videos.
    appearances = fre_df[fre_df['country'] == c]['title'].value_counts().sort_index()
    video_list.append(appearances)
    max_list.append(max(appearances.index))
fig, axes = plt.subplots(nrows=5, figsize=(15, 20),)
st = fig.suptitle("How long a video trend in different countries?", fontsize=20)
st.set_y(0.9)
for i, ax in enumerate(axes):
    ax.plot(video_list[i].index, video_list[i])
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.set_xlabel("appearances", fontsize=14)
    ax.set_ylabel(country_list[i], fontsize=24)
    ax.axes.set_xlim(1, 30)
# Tweak spacing between subplots to keep labels from overlapping.
plt.subplots_adjust(hspace=0.2)
plt.subplots_adjust(wspace=0)
# C. Overall engagement ratios per category. The same compute-sort-plot
# pattern appeared three times verbatim; it is factored into one helper.
def _category_ratio_barplot(numerator, denominator, label):
    """Bar-plot sum(numerator)/sum(denominator) per category, descending.

    Returns the two-column ['category', 'ratio'] dataframe that was plotted.
    """
    ratio = df.groupby('category')[numerator].sum() / df.groupby('category')[denominator].sum()
    ratio = ratio.sort_values(ascending=False).reset_index()
    ratio.columns = ['category', 'ratio']
    plt.subplots(figsize=(10, 15))
    sns.barplot(x="ratio", y="category", data=ratio, label=label)
    return ratio

# Likes-to-dislikes ratio per category.
like_dislike_ratio = _category_ratio_barplot('likes', 'dislikes', "Likes-Dislikes Ratio")
# Views-to-comments ratio per category.
views_comment_ratio = _category_ratio_barplot('views', 'comment_count', "Views-Comments Ratio")
# Likes-to-views ratio per category.
view_like_ratio = _category_ratio_barplot('likes', 'views', "Views-Likes Ratio")
# D. Category make-up of trending videos in US/CA/GB/FR/IN.
# value_counts().reset_index() changed its column names in pandas 2.0
# ('index'/'category' -> 'category'/'count'), so name them explicitly to keep
# the downstream plots working on any pandas version.
_cat_frames = []
for _code in ('US', 'CA', 'GB', 'FR', 'IN'):
    _counts = df[df['country'] == _code]['category'].value_counts().reset_index()
    _counts.columns = ['index', 'category']  # 'index' = category name, 'category' = count
    _counts['country'] = _code
    _cat_frames.append(_counts)
cat_df_us, cat_df_ca, cat_df_gb, cat_df_fr, cat_df_in = _cat_frames
cat_df_all = pd.concat(_cat_frames)
cat_df_all.head()
sns.barplot(x='category', y='index', hue='country', data=cat_df_all, palette="Blues", linewidth=20)
# sns.factorplot was renamed/removed; catplot is the modern equivalent.
sns.catplot(x='category', y='country', col='index', col_wrap=4, kind='bar', data=cat_df_all, palette="RdPu")
# E. Overall correlation of the numeric engagement variables.
corr_list = df[['views','likes','dislikes','comment_count']]
plt.figure(figsize=(15,10))
ax = sns.heatmap(data=corr_list.corr(),annot=True)
# Workaround for the matplotlib 3.1.1 regression that clipped the top and
# bottom rows of seaborn heatmaps; expands the y-limits by half a cell.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
# The like-to-dislike ratio measures viewers' approval of a video.
# NOTE(review): rows with dislikes == 0 produce inf here; over_time() below
# filters those rows out before averaging — confirm no other consumer of
# ldratio needs the same guard.
df["ldratio"] = df["likes"]/df["dislikes"]
# These variables record the extent to which people react to the video.
df["perc_comment"] = df["comment_count"] / df["views"]
df["perc_reaction"] = (df["likes"] + df["dislikes"]) / df["views"]
# The Numbers Over Time:
# visualize views, like-to-dislike ratio, and more over the trending dates.
def over_time(df, var):
    """Plot the daily average of column `var` across trending dates.

    Rows with zero dislikes are excluded so ratio columns (e.g. ldratio,
    which divides by dislikes) stay finite. mean(numeric_only=True) keeps
    groupby().mean() from raising on the non-numeric columns (title,
    channel_title, ...) under pandas >= 2.0, where the old silent-drop
    behaviour was removed.
    """
    averages = df[df["dislikes"] != 0].groupby("trending_date").mean(numeric_only=True)
    plt.plot(averages.index.values, averages[var])
    plt.xticks(rotation=90)
    plt.xlabel("Date")
    plt.ylabel(f"Average {var}")
    plt.title(f"Average {var} Over Time (11/14/17 - 6/14/18)")
    plt.show()
over_time(df, "views")
# Views per trending video appeared to skyrocket beginning around February of 2018.
over_time(df, "ldratio")
# Some event caused the average like-to-dislike ratio on trending videos to decrease dramatically around January of 2018.
over_time(df, "perc_reaction") # recall perc_reaction is (likes + dislikes) / views
# There was a large increase in people who liked and disliked trending videos in December of 2017, and a large decrease in May of 2018.
over_time(df, "perc_comment") # recall perc_comment is comments / views
# The percent of people who comment on trending videos has been quite volatile, though it exhibits patterns similar to the "perc_reaction" chart (May 2018, for example).
# Which publishing hour receives the most views?
# numeric_only=True: groupby().mean() raises on non-numeric columns in
# pandas >= 2.0 (the old behaviour silently dropped them).
by_hour = df.groupby("publish_hr").mean(numeric_only=True)
plt.plot(by_hour.index.values, by_hour["views"])
plt.scatter(by_hour.index.values, by_hour["views"])
plt.xlabel("Publish Hour of the Day")
plt.ylabel("Average Number of Views")
plt.title("Average Amount of Views on Trending Videos by the Hour")
plt.show()
# Videos published at 4 AM received the most views on average. This may be because people are first waking up after this time and have all day to make the video popular. 9 AM is also a good time to publish a video when hoping for many views on the trending list. Trending videos published later in the evening usually aren't viewed as much.
# Number of videos published on each weekday (plotly bar chart).
_weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
data = df['publish_wd'].map(dict(zip(range(7), _weekday_names))).value_counts()
fig1 = go.Figure(data=[go.Bar(x=data.index.values, y=data, textposition='auto')])
fig1.update_layout(title="Number of Videos Published in Weekday", yaxis=dict(title="Videos"))
fig1.show()
# Most of the videos are published on weekdays instead of Saturday and Sunday, which can increase the chances of more views.
# Subset of very popular videos (> 20M views) used by the bubble charts below.
_bubble_cols = ['title', 'channel_title', 'category_id', 'views', 'publish_wd',
                'publish_hr', 'likes', 'dislikes', 'country']
data = df[_bubble_cols].loc[df.views > 20000000].reset_index()
# Replace the numeric weekday (0-6) with its name.
_wd_names = dict(zip(range(7), ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']))
data.publish_wd = data.publish_wd.map(_wd_names)
def bubble_plt(target, plot_title, target_title, data):
    """Bubble chart of target/views ratio vs. views, one trace per weekday.

    target: column name ('likes' or 'dislikes'); bubble size = target/views.
    NOTE: mutates `data` by adding 'text' and 'size' columns (preserved from
    the original behaviour).
    """
    hover_text = []
    bubble_size = []
    for index, row in data.iterrows():
        hover_text.append(('Title: {title}<br>'+
                           'Category: {category_id}<br>'+
                           'Channel: {channel_title}<br>'+
                           'Views: {views}<br>'+
                           'Likes: {likes}<br>'+
                           'Dislikes: {dislikes}<br>'+
                           'country: {country}<br>'
                           ).format(title=row['title'],
                                    channel_title=row['channel_title'],
                                    category_id=row['category_id'],
                                    views=row['views'],
                                    likes=row['likes'],
                                    dislikes=row['dislikes'],
                                    country=row['country']))
        bubble_size.append(row[target] / row['views'])
    data['text'] = hover_text
    data['size'] = bubble_size
    fig2 = go.Figure()
    # BUG FIX: the original list contained the typo 'Thurday', which never
    # matched the mapped weekday names and silently dropped every Thursday
    # video from the chart.
    weekday = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    wd_data = {wd: data.query("publish_wd == '%s'" % wd)
               for wd in weekday}
    # Create one scatter trace per weekday.
    for key, values in wd_data.items():
        fig2.add_trace(go.Scatter(
            x=values['views'], y=values[target]/values['views'],
            name=key, text=values['text'],
            marker_size=values['size'],
        ))
    # The following formula is recommended by https://plotly.com/python/bubble-charts/
    sizeref = 2.*max(data['size'])/(1000)
    fig2.update_traces(mode='markers', marker=dict(sizemode='area', sizeref=sizeref, line_width=2))
    fig2.update_layout(
        title=plot_title,
        xaxis=dict(
            title='Number of views in millions',
            gridcolor='white',
            type='log',
            gridwidth=2,
        ),
        yaxis=dict(
            title=target_title,
            gridcolor='white',
            gridwidth=2,
        ),
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
        legend={'itemsizing': 'constant'}
    )
    fig2.show()
# Like/view ratio vs. number of views for the most-viewed videos.
bubble_plt('likes',"like/view Ratio vs. Number of views", "Like/view Ratio", data)
# Dislike/view ratio vs. number of views.
bubble_plt('dislikes', "Dislikes/view ratio vs. Number of views", "Dislikes/view Ratio",data)
# Create a dataframe for modeling, with a column 'day_to_trend' holding the number of days a video takes to get onto the trending list.
# Modeling subset: videos with comments disabled but ratings enabled.
# NOTE(review): this keeps comments_disabled & ~ratings_disabled rows —
# confirm this subset (rather than fully-enabled videos) is what was intended.
new_data = df.loc[(df.comments_disabled) &
        (~df.ratings_disabled)].copy()
# Days between publishing and appearing on the trending list.
# trending_date.dt.date and publish_date are both datetime.date objects, so
# the elementwise np.subtract yields timedeltas; .days extracts whole days.
# NOTE(review): the dtype=np.float32 argument appears to be ignored on this
# object-dtype path — verify it is not relied upon.
new_data['day_to_trend'] = abs(np.subtract(new_data.trending_date.dt.date,new_data.publish_date,dtype=np.float32)
                 .apply(lambda x: x.days))
# Keep only the model features, the target, and the title.
left_vars = ['views','likes','dislikes','comment_count','publish_wd','publish_hr','day_to_trend','title']
new_data = new_data[left_vars]
new_data.reset_index(inplace=True)
new_data.head()
# Distribution check
# Pairwise scatter matrix of the discrete features and the target.
from pandas.plotting import scatter_matrix

scatter_matrix(new_data[['publish_wd', 'publish_hr', 'day_to_trend']])
plt.show()
# Histogram of the raw days-to-trend values.
plt.hist(new_data['day_to_trend'])
plt.title("Histogram of Original Days to Trend")
plt.show()
# The bar graph shows that none of them follows a Gaussian distribution; they appear closer to a gamma distribution. Since days-to-trend values cluster at one location, we narrow down to the first two weeks (14 days).
# Zoom in on videos that trended within two weeks of publishing.
new_data_14 = new_data.loc[new_data.day_to_trend <= 14]
plt.hist(new_data_14['day_to_trend'])
# Title fixed to match the filter: values > 14 are removed (the original
# title incorrectly said "> 7").
plt.title("Histogram of Days to Trend After Removing values > 14")
plt.show()
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Binary classification target: did the video trend within a week?
new_data['day_to_trend'] = new_data['day_to_trend'] <= 7
# Random-forest model definition
def rf_model(X, y, my_pg = None):
    """Grid-search a RandomForestClassifier over `my_pg`.

    Returns (best_params, best_score) from 5-fold cross-validated accuracy.
    When my_pg is None, a default grid over tree depth and forest size is used.
    """
    if my_pg is None:
        # Default hyperparameter grid: depth 5-9, 155-169 trees.
        my_pg = {
            'max_depth': range(5, 10),
            'n_estimators': range(155, 170),
        }
    search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=my_pg, cv=5, scoring='accuracy', verbose=0, n_jobs=-1)
    fitted = search.fit(X, y)
    return fitted.best_params_, fitted.best_score_
# Split the dataset into a train set (70%) and a test set (30%)
# Feature matrix and target, then a 70/30 split and a grid search on the
# training portion.
_feature_cols = ['views', 'likes', 'dislikes', 'comment_count', 'publish_wd', 'publish_hr']
X = new_data[_feature_cols]
y = new_data['day_to_trend']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=4)
print(rf_model(X_train, y_train))  # ({'max_depth':9, 'n_estimators': 168}, 0.9398826708852417)
# Random-forest classification modeling
# Fit the final model with the best hyperparameters found by the grid search.
from sklearn.metrics import classification_report as cr
# oob_score=True gives a free validation estimate from out-of-bag samples.
rfc = RandomForestClassifier(max_depth = 9, n_estimators = 168, oob_score = True, warm_start = True)
rfc.fit(X_train, y_train)
# Out-of-bag score
# Out-of-bag accuracy vs. held-out test accuracy.
print(rfc.oob_score_) # 0.9383350462487153
print(rfc.score(X_test,y_test)) # 0.9436450839328537
# Importances of 'views', 'likes', 'dislikes', 'comment_count', 'publish_wd', and 'publish_hr'
# Feature importances (order matches the feature columns used for X).
print(rfc.feature_importances_)
# Confusion matrix and report per split; predict once per split and reuse
# the result (the original called rfc.predict twice for each split).
pred = rfc.predict(X_train)
print(pd.crosstab(pd.Series(y_train, name='Actual'), pd.Series(pred, name='predicted_train')))
print(cr(y_train, pred))  # ~97% accuracy on the training set
pred = rfc.predict(X_test)
print(pd.crosstab(pd.Series(y_test, name='Actual'), pd.Series(pred, name='predicted_test')))
print(cr(y_test, pred))  # ~94% accuracy on the test set
# pip install scikit-plot
import scikitplot as skplt
# plot_precision_recall_curve was removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator is the supported replacement.
from sklearn.metrics import average_precision_score, PrecisionRecallDisplay

# ROC curves from the predicted class probabilities.
prob = rfc.predict_proba(X_test)
myplot = skplt.metrics.plot_roc(y_test, prob)
# Average precision uses the positive-class probability column.
average_precision = average_precision_score(y_test, prob[:, 1])
disp = PrecisionRecallDisplay.from_estimator(rfc, X_test, y_test)
disp.ax_.set_title('2-class Precision-Recall curve:'
                   'AP={0:0.2f}'.format(average_precision))
# F1 on the test-set predictions computed in the previous cell.
score = metrics.f1_score(np.array(y_test), pred)
print('The f1 score for this model is {}'.format(score))